# Set the project root; every relative path used below (Coding/, Analysis/,
# data/) is resolved against this directory.
setwd("/Users/lingechun/Dropbox/ukraine_public_opinion")
# Alternative project root, kept for reference:
#setwd("/home/act/Dropbox/ukraine_public_opinion")

library(stringr)
library(ggplot2)
library(ggbreak) 
library(ggmosaic)
library(readxl)
library(cowplot)
library(tibble)
library(dplyr)



################## RA: Lizzie ##########################
# Load Lizzie's topic codes; lower-case and squash whitespace so that the
# label comparisons below are insensitive to case and stray spaces.
coded <- read_excel("Coding/ukraine-topics-09.30.2022-lizzie.xlsx")
coded$topic <- str_squish(tolower(coded$topic))
table(coded$topic)


# Correct typos in labels: names are the labels as typed, values are the
# labels they are replaced with.
lizzie_fixes <- c(
  "4109"                       = "ukraine support",
  "dometic politics"           = "domestic politics",
  "domestic politiics"         = "domestic politics",
  "domestic poitics - biden"   = "domestic politics - biden",
  "domestic politics - bidnen" = "domestic politics - biden",
  "f"                          = "foreign policy",
  "misinformatiom"             = "misinformation",
  "misinfromation"             = "misinformation",
  "u"                          = "ukraine support",
  "riussia support"            = "russia discourse",
  "russia support"             = "russia discourse"
)
lizzie_hit <- coded$topic %in% names(lizzie_fixes)
coded$topic[lizzie_hit] <- unname(lizzie_fixes[coded$topic[lizzie_hit]])

# Two rows are overridden by position rather than by label.
coded[814, "topic"] <- "domestic politics - trump"
coded[2699, "topic"] <- "russia discourse"

# Combine Domestic Politics: the Biden/Trump sub-labels collapse into one.
domestic_sub <- c("domestic politics - biden", "domestic politics - trump")
coded$topic[coded$topic %in% domestic_sub] <- "domestic politics"

coded$topic <- str_to_title(coded$topic)
table(coded$topic)
length(table(coded$topic))

################## RA: Josh ##########################
# Load Josh's topic codes with the same case/whitespace normalisation.
coded2 <- read_excel("Coding/ukraine-topics-09.30.2022-josh.xlsx")
coded2$topic <- str_squish(tolower(coded2$topic))
table(coded2$topic)


# Correct typos in labels and combine Domestic Politics in one pass:
# the colon and dash spellings of the Biden/Trump sub-labels all end up as
# "domestic politics", and "russia support" becomes "russia discourse".
josh_fixes <- c(
  "domestic politics: biden"  = "domestic politics",
  "domestic politics: trump"  = "domestic politics",
  "domestic politics - biden" = "domestic politics",
  "domestic politics - trump" = "domestic politics",
  "russia support"            = "russia discourse"
)
josh_hit <- coded2$topic %in% names(josh_fixes)
coded2$topic[josh_hit] <- unname(josh_fixes[coded2$topic[josh_hit]])


coded2$topic <- str_to_title(coded2$topic)
table(coded2$topic)
length(table(coded2$topic))

# Put the two coders' labels side by side. Column 2 of each sheet is renamed
# to the coder's name (presumably the `topic` column cleaned above -- verify).
# Josh's labels are attached by row position, so both sheets are assumed to
# list the same topics in the same order -- TODO confirm.
names(coded)[2] <- "lizzie"
names(coded2)[2] <- "josh"
coded$josh <- coded2$josh

example_cols <- paste0("example", 1:10)
coded <- coded[, c("topic_id", "lizzie", "josh", "words", example_cols)]


# Check agreement rate between two RAs: 0.56
same_label <- coded$lizzie == coded$josh
sum(same_label) / nrow(coded)

# Retain topics agreed by two RAs
agree <- coded[which(same_label), ]
agree$recode <- agree$lizzie

# Topics the RAs disagreed on, re-labelled by two further coders. Column 4 of
# each file is used as the final label.
data_dis1 <- read.csv(
  "Coding/Disagreement/ukraine_topic_full_disagree_Sep_comb(Jin)2.csv"
)
data_dis2 <- read.csv(
  "Coding/Disagreement/ukraine_topic_full_disagree_Sep_comb(Gechun)2.csv"
)

names(data_dis1)[4] <- "recode"
names(data_dis2)[4] <- "recode"

# Rows 1-1499 come from the first file and rows 1500-2716 from the second.
disagree <- rbind(data_dis1[1:1499, ], data_dis2[1500:2716, ])

coded <- rbind(agree, disagree)
recode_fixes <- c(
  "Random"         = "Miscellaneous",
  "Russia Support" = "Russia Discourse"
)
recode_hit <- coded$recode %in% names(recode_fixes)
coded$recode[recode_hit] <- unname(recode_fixes[coded$recode[recode_hit]])

##################### For full data (doc2vec deep learn)  #########################
part_topic <- read.csv("Analysis/partsian_topic_d2vfull_deep_include_elite0618_Oct.csv")

# Divide Groups -- Conservative, Moderate, Liberal
# Missing Dem/Rep scores count as zero.
part_topic$Dem <- replace(part_topic$Dem, is.na(part_topic$Dem), 0)
part_topic$Rep <- replace(part_topic$Rep, is.na(part_topic$Rep), 0)
part_topic$partisan_DplusR <- part_topic$Dem + part_topic$Rep
part_topic$partisan_DminusR <- part_topic$Dem - part_topic$Rep

# Drop rows whose two scores sum to zero.
part_topic <- part_topic[which(part_topic$partisan_DplusR != 0), ]

# Sign of D - R gives the side; rows with |D - R| / (D + R) below 0.5 are
# then overridden as Moderate.
part_topic$ideo <- with(
  part_topic,
  ifelse(partisan_DminusR < 0, "Conservative", "Liberal")
)
part_topic$ideo[
  with(part_topic, abs(partisan_DminusR) / partisan_DplusR < 0.5)
] <- "Moderate"


###################### Merge Meta Info with Labeled Topics ################
# Attach the final substantive label to every row through its topic id.
label <- coded[, c("topic_id", "recode")]
names(label)[1] <- "assigned_topic"
data <- merge(label, part_topic, by = "assigned_topic")
author_ids_insample <- data.frame(author_id = unique(data$author_id))
#write.csv(author_ids_insample, "/Users/lingechun/Dropbox/ukraine_public_opinion/data/author_ids_insample.csv", 
#          row.names = F)

######################################################################
########################### Summary Statistics #######################
######################################################################

colnames(part_topic)

# Row counts per group are taken from the full data; user counts are taken
# after collapsing to distinct (author, group, score) combinations.
N_tweet <- table(part_topic$ideo)

user_cols <- c("author_id", "ideo", "Dem", "Rep",
               "partisan_DplusR", "partisan_DminusR")
part_user <- unique(part_topic[, user_cols])
N_user <- table(part_user$ideo)
#Ave <- N_tweet/N_user


# Number of rows per author within each group.
con_users <- part_topic %>%
  filter(ideo == "Conservative") %>%
  count(author_id)
lib_users <- part_topic %>%
  filter(ideo == "Liberal") %>%
  count(author_id)
mod_users <- part_topic %>%
  filter(ideo == "Moderate") %>%
  count(author_id)

# Conservative, Liberal, Moderate: the same alphabetical order that table()
# gives N_user and N_tweet, so the columns of table_sum line up row by row.
rows_per_author <- list(con_users$n, lib_users$n, mod_users$n)
Max <- unlist(lapply(rows_per_author, max))
Min <- unlist(lapply(rows_per_author, min))
Median <- unlist(lapply(rows_per_author, median))
table_sum <- cbind(N_user, Median, Max, Min, N_tweet)
table_sum

# Group shares of users and of rows, then overall rows per user.
prop.table(table_sum[, "N_user"])
prop.table(table_sum[, "N_tweet"])
sum(table_sum[, "N_tweet"]) / sum(table_sum[, "N_user"])







#####################################################################
############ Chisquare Test of All Clusters Distribution ############
#####################################################################

## Chi-square test of homogeneity
# Count rows per (cluster, group) and reshape to one row per group with one
# column per cluster. data$ideo is a character column here, so the groups
# come out in alphabetical order, matching the row names assigned below.
table_cluster <- as.data.frame(table(data$assigned_topic, data$ideo))
table_cluster <- reshape(table_cluster,
                         idvar = "Var2", timevar = "Var1",
                         direction = "wide")
rownames(table_cluster) <- c("Conservative", "Liberal", "Moderate")
table_cluster <- table_cluster[, -1]

# comparing three ideological groups
chisq.test(table_cluster, correct = TRUE)

# comparing liberal and conservative
chisq.test(table_cluster[c(1, 2), ], correct = TRUE)

# comparing liberal and moderate
chisq.test(table_cluster[c(2, 3), ], correct = TRUE)

# comparing conservative and moderate
chisq.test(table_cluster[c(1, 3), ], correct = TRUE)


#topic_freq <- table(data$assigned_topic)/nrow(data)
#topic_prop <- prop.table(table(data$assigned_topic))
#topic_prop <- as.data.frame(topic_prop)

#ggplot(topic_prop, aes(x =Var1,y=Freq ))+geom_col()+ 
#  scale_x_discrete(breaks=seq(0, 99, 10))+
#  xlab("Index of Top 100 Micro-Topics") + ylab("Proportions in All Topics")+
#  ggsave(file="Analysis/Oct_figures_3groups/top_prop.pdf", 
#         width=6, height=8, dpi=10000)


#####################################################################
##################### Density Plots of All Clusters  ################
#####################################################################

# Bar chart of the share of rows falling in each cluster for one group.
#   df       rows of `data` belonging to the group
#   fill     bar colour
#   y_label  y-axis label
# The y axis is limited to [0, 0.01].
cluster_share_plot <- function(df, fill, y_label) {
  ggplot(df, aes(x = assigned_topic)) +
    geom_bar(aes(y = ..prop..), fill = fill) +
    xlab("Cluster Index") + ylab(y_label) +
    ylim(0, 0.01)
}

con_data <- data[data$ideo == "Conservative", ]
con_cluster <- cluster_share_plot(con_data, "red", "Conservative Tweets")

lib_data <- data[data$ideo == "Liberal", ]
lib_cluster <- cluster_share_plot(lib_data, "blue", "Liberal Tweets")

mod_data <- data[data$ideo == "Moderate", ]
mod_cluster <- cluster_share_plot(mod_data, "green", "Moderate Tweets")

# Stack the three panels. ggsave() is called as its own statement with an
# explicit plot: adding ggsave() to a plot with `+` is rejected by recent
# ggplot2 releases.
cluster_grid <- plot_grid(
  plotlist = list(con_cluster, mod_cluster, lib_cluster),
  ncol = 1, align = "v"
)
ggsave(filename = "Analysis/Oct_figures_3groups/cluster_dis.pdf",
       plot = cluster_grid,
       width = 12, height = 8, dpi = 10000)
cluster_grid
 



#######################################
##########   Top50 Cluster   ##########
#######################################

# Share and count of rows per micro-topic in `df`, tagged with `group`.
# Returns a data frame with columns Micro_Topic, Percentage, Number, User.
micro_topic_share <- function(df, group) {
  counts <- table(df$assigned_topic)
  shares <- as.data.frame(prop.table(counts))
  colnames(shares) <- c("Micro_Topic", "Percentage")
  shares$Number <- counts
  shares$User <- group
  shares
}

# Horizontal bars of topic share, one facet per value of User.
top50_bar_plot <- function(df) {
  ggplot(df,
         aes(y = Label,
             x = Percentage,
             fill = User)) +
    geom_bar(stat = "identity",
             position = "stack",
             width = 0.5) +
    theme(axis.text.y = element_text(angle = 0, size = 5),
          axis.text.x = element_text(angle = 0, size = 5),
          plot.title = element_text(hjust = 0.5),
          legend.position = "none") +
    labs(fill = "") +
    facet_grid(. ~ User) +
    ggtitle(" ") +
    xlab("") + ylab(" ") +
    scale_x_continuous(labels = scales::percent)
}

panel_levels <- c("Conservative", "Moderate", "Liberal", "All")

# Display names for the 50 topics, repeated once per panel.
# Assumes every per-group table below has exactly 50 rows in the same order
# as this file -- TODO confirm.
names50 <- read.csv("Coding/ukraine_topic_full top50.csv")[c(1:50), ]
label <- rep(names50$topic, 4)


# (1a) Individual top50 distribution by ideology (normalize)
# Shares are computed within the rows whose assigned_topic is below 50.

data50 <- subset(data, assigned_topic < 50)
con_topic50 <- subset(data50, ideo == "Conservative")
lib_topic50 <- subset(data50, ideo == "Liberal")
mod_topic50 <- subset(data50, ideo == "Moderate")

table_con50 <- micro_topic_share(con_topic50, "Conservative")
table_lib50 <- micro_topic_share(lib_topic50, "Liberal")
table_mod50 <- micro_topic_share(mod_topic50, "Moderate")
table_all50 <- micro_topic_share(data50, "All")

table50 <- rbind(table_con50, table_lib50, table_mod50, table_all50)
table50$User <- factor(table50$User, levels = panel_levels)
table50$Label <- factor(label, levels = rev(names50$topic))

# ggsave() is called on its own with an explicit plot: adding it to a plot
# with `+` is rejected by recent ggplot2 releases.
top50_normalize_plot <- top50_bar_plot(table50)
ggsave(filename = "Analysis/Oct_figures_3groups/top50_ideo_normalize.pdf",
       plot = top50_normalize_plot,
       width = 8, height = 8, dpi = 10000)
top50_normalize_plot


# (1b) Individual top50 distribution by ideology (without normalize)
# Shares are computed over all topics; only the first 50 rows of each table
# are kept.

con_topic <- subset(data, ideo == "Conservative")
lib_topic <- subset(data, ideo == "Liberal")
mod_topic <- subset(data, ideo == "Moderate")

table_con <- micro_topic_share(con_topic, "Conservative")
table_lib <- micro_topic_share(lib_topic, "Liberal")
table_mod <- micro_topic_share(mod_topic, "Moderate")
table_all <- micro_topic_share(data, "All")

table <- rbind(table_con[1:50, ], table_lib[1:50, ],
               table_mod[1:50, ], table_all[1:50, ])
table$User <- factor(table$User, levels = panel_levels)
# Built from `label` directly; this used to be read back from table50.
table$Label <- factor(label, levels = rev(names50$topic))

top50_plot <- top50_bar_plot(table)
ggsave(filename = "Analysis/Oct_figures_3groups/top50_ideo.pdf",
       plot = top50_plot,
       width = 8, height = 8, dpi = 10000)
top50_plot





#######################################
########## Substantive Topic ##########
#######################################

# (1) Substantive Topic by Three Groups
# From here on `data` no longer contains rows labelled "Miscellaneous".
data <- subset(data, recode != "Miscellaneous")

# Share and count of rows per substantive topic in `df`, tagged with `group`.
# Returns a data frame with columns Substantive_Topic, Percentage, Number,
# User.
substantive_share <- function(df, group) {
  counts <- table(df$recode)
  shares <- as.data.frame(prop.table(counts))
  colnames(shares) <- c("Substantive_Topic", "Percentage")
  shares$Number <- counts
  shares$User <- group
  shares
}

con_topic <- subset(data, ideo == "Conservative")
lib_topic <- subset(data, ideo == "Liberal")
mod_topic <- subset(data, ideo == "Moderate")

table_con <- substantive_share(con_topic, "Conservative")
table_lib <- substantive_share(lib_topic, "Liberal")
table_mod <- substantive_share(mod_topic, "Moderate")

# Row order is Conservative, Moderate, Liberal; later code indexes rows by
# position within each topic and relies on this order.
table <- rbind(table_con, table_mod, table_lib)
write.csv(table, "data/Substantive_Topic_table.csv", row.names = FALSE)


### Plot grouped bar chart ###

table$`Ideology Groups` <- factor(
  table$User,
  levels = c("Conservative", "Moderate", "Liberal")
)

subst_bar_chart <- ggplot(table,
                          aes(x = Substantive_Topic, y = Percentage,
                              fill = `Ideology Groups`)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "",
       x = "Substantive Topic",
       y = "Proportion") + theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top", legend.title = element_blank())

# ggsave() is called on its own with an explicit plot: adding it to a plot
# with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "Analysis/Oct_figures_3groups/subst_bar_chart.pdf",
       plot = subst_bar_chart,
       width = 10, height = 6, dpi = 10000)
subst_bar_chart






### Bootstrap Topic Proportions ###
# Percentile bootstrap of the within-group topic shares held in `table`.
#
# Number of replicates. The loop here has always run 10 draws, while the
# interval step used to index columns 3:2003, which do not exist after 10
# draws; both now follow n_boot. Raise it for stable 95% intervals.
# No seed is set, so the intervals differ from run to run.
n_boot <- 10

# One row per (topic, group), in the same order as `table`.
boot_prop <- table[, c("Substantive_Topic", "User")]
row_topic <- as.character(boot_prop$Substantive_Topic)
row_group <- as.character(boot_prop$User)

boot_draws <- matrix(
  NA_real_,
  nrow = nrow(boot_prop), ncol = n_boot,
  dimnames = list(NULL, paste0("prop", seq_len(n_boot)))
)

for (i in seq_len(n_boot)) {
  print(i)
  # Resample rows with replacement. sample() applied to a data frame draws
  # columns, so the row index is sampled instead.
  data_boot <- data[sample.int(nrow(data), replace = TRUE), ]

  # Topic-by-group counts with fixed levels: a topic or group missing from
  # a resample still gets a cell, so every draw has the same length.
  counts <- table(
    factor(data_boot$recode, levels = unique(row_topic)),
    factor(as.character(data_boot$ideo), levels = unique(row_group))
  )
  # Column-wise proportions are the share of each topic within a group.
  shares <- unclass(prop.table(counts, margin = 2))
  boot_draws[, i] <- shares[cbind(row_topic, row_group)]
}

# Columns 1-2 identify the row; columns 3 onward hold one draw each.
boot_prop <- cbind(boot_prop, boot_draws)

# 2.5% and 97.5% quantiles of each row's draws, concatenated row after row
# (lower, upper, lower, upper, ...).
CIs <- unlist(lapply(seq_len(nrow(boot_draws)), function(j) {
  quantile(boot_draws[j, ], probs = c(0.025, 0.975), na.rm = TRUE)
}))



####################################
###### Mosaic Visualization ########
####################################
# From here on data$ideo is a factor ordered Conservative, Moderate, Liberal.
data$ideo <- factor(data$ideo, levels = c("Conservative", "Moderate", "Liberal"))
mosaic_data <- as.data.frame(table(data$ideo, data$recode))

p <-
  ggplot(mosaic_data %>% as_tibble()) +
  geom_mosaic(
    aes(weight = Freq, x = product(Var1), fill = Var2)
  )
p

# Centre of every tile, labelled with the rounded share from `table`.
# NOTE(review): labels are taken from table$Percentage in its row order;
# confirm that this matches the tile order returned by layer_data().
tile_labels <- layer_data(p, 1) %>%
  select(xmin:ymax) %>%
  mutate(m.x = (xmin + xmax) / 2, m.y = (ymin + ymax) / 2) %>%
  select(m.x, m.y) %>%
  mutate(string = round(table$Percentage, 3))

mosaic_plot <- p +
  geom_text(data = tile_labels,
            # set label locations to centers, set labels to strings
            aes(x = m.x, y = m.y, label = string)) +
  labs(title = "") +
  xlab(" ") +
  ylab(" ") + theme_mosaic() +
  theme(legend.position = "none")

# ggsave() is called on its own with an explicit plot: adding it to a plot
# with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "Analysis/Oct_figures_3groups/subst_mosaic_white.pdf",
       plot = mosaic_plot,
       width = 8, height = 10, dpi = 10000)
mosaic_plot



  
## Stacked Plots ###
# Topic share per ideology group. Despite the heading, the bars are drawn
# side by side (position = "dodge"), not stacked.
table$User <- factor(table$User, levels = c("Conservative", "Moderate", "Liberal"))
subst_ideo_plot <- ggplot(table,
                          aes(x = Substantive_Topic,
                              y = Percentage,
                              fill = User)) +
  geom_bar(stat = "identity",
           position = "dodge",
           width = 0.5) +
  theme(axis.text.y = element_text(angle = 0, size = 5),
        axis.text.x = element_text(angle = 0, size = 5),
        plot.title = element_text(hjust = 0.5),
        legend.position = "bottom") +
  labs(fill = "") +
  ggtitle(" ") +
  ylab("Topic Proportion by Ideology Group") + xlab("Substantive Topic")

# ggsave() is called on its own with an explicit plot: adding it to a plot
# with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "Analysis/Oct_figures_3groups/subst_ideo.pdf",
       plot = subst_ideo_plot,
       width = 6, height = 8, dpi = 10000)
subst_ideo_plot


# Three-sample test of equal proportions, one row per substantive topic.
# The first row of test_results is an all-NA placeholder that is dropped
# when the table is printed.
test_results <- as.data.frame(matrix(nrow = 1, ncol = 5))
colnames(test_results) <- c("Topic", "Prop. Con.",
                            "Prop. Mod.",
                            "Prop. Lib.",
                            "P-value")

# Total count per group, summed over that group's rows of `table`.
conservative_n <- sum(table$Number[which(table$User == "Conservative")])
moderate_n <- sum(table$Number[which(table$User == "Moderate")])
liberal_n <- sum(table$Number[which(table$User == "Liberal")])

# Round to three decimals and keep trailing zeros.
fmt3 <- function(x) format(round(x, 3), nsmall = 3)

three_group_n <- c(conservative_n, moderate_n, liberal_n)
for (topic_name in as.character(unique(table$Substantive_Topic))) {
  topic_rows <- table[which(table$Substantive_Topic == topic_name), ]
  fit <- prop.test(x = topic_rows$Number, n = three_group_n)
  test_results <- rbind(
    test_results,
    c(topic_name,
      fmt3(fit$estimate[1]),
      fmt3(fit$estimate[2]),
      fmt3(fit$estimate[3]),
      fmt3(fit$p.value))
  )
}

library(xtable)
print(xtable(test_results[-1, ]), include.rownames = FALSE)



# (2) Density Plots of Each Topic by D-R

# Centre plot titles. theme_update() changes the session-wide default
# theme, so one call here covers every panel below (it was previously
# repeated on each loop iteration).
theme_update(plot.title = element_text(hjust = 0.5))

# One density panel per substantive topic found in `df`.
#   df       data frame with a `recode` column
#   mapping  aes() supplying the x variable
#   x_label  x-axis label
#   y_max    upper y limit; NULL leaves the y axis free
# Returns a list of ggplot objects in order of first appearance of each
# topic.
topic_density_panels <- function(df, mapping, x_label, y_max = NULL) {
  lapply(as.character(unique(df$recode)), function(topic) {
    this_topic <- df[which(df$recode == topic), ]
    panel <- ggplot(this_topic, mapping) +
      geom_density() +
      xlab(x_label) +
      ggtitle(topic)
    if (!is.null(y_max)) {
      panel <- panel + ylim(0, y_max)
    }
    panel
  })
}

# Arrange panels three per row, write them to `file`, and return the grid.
# ggsave() gets the plot explicitly: adding ggsave() to a plot with `+` is
# rejected by recent ggplot2 releases.
save_panel_grid <- function(panels, file) {
  panel_grid <- plot_grid(plotlist = panels, ncol = 3, align = "v")
  ggsave(filename = file, plot = panel_grid,
         width = 12, height = 8, dpi = 10000)
  panel_grid
}

## (a) D-R
plots_DminR <- topic_density_panels(data, aes(x = partisan_DminusR), "")
save_panel_grid(plots_DminR,
                "Analysis/Oct_figures_3groups/density_DminR.pdf")

## (b) D
lib_data <- subset(data, ideo == "Liberal")
plots_D <- topic_density_panels(
  lib_data, aes(x = Dem),
  "Weak Liberal                    Strong Liberal",
  y_max = 0.5
)
save_panel_grid(plots_D, "Analysis/Oct_figures_3groups/density_D.pdf")

## (c) R
con_data <- subset(data, ideo == "Conservative")
plots_R <- topic_density_panels(
  con_data, aes(x = Rep),
  "Weak Conservative                    Strong Conservative",
  y_max = 0.5
)
save_panel_grid(plots_R, "Analysis/Oct_figures_3groups/density_R.pdf")

## (d) (D-R)/(D+R)
plots_score <- topic_density_panels(
  data, aes(x = partisan_DminusR / partisan_DplusR),
  "Partisan Score",
  y_max = 0.5
)
save_panel_grid(plots_score,
                "Analysis/Oct_figures_3groups/density_score.pdf")




# (1a)-(1c) Pairwise comparisons of topic shares between ideology groups.
# For each pair: a LaTeX table of two-sample proportion tests per topic and
# a plot of the difference in proportions with its confidence interval.
library(xtable)

# Round to three decimals and keep trailing zeros.
fmt3 <- function(x) format(round(x, 3), nsmall = 3)

# Two-sample prop.test() for every substantive topic.
#   counts_df  data frame with Substantive_Topic and Number columns
#   rows       positions of the two groups among each topic's rows
#   group_n    total counts of those two groups, in the same order
# Returns a list of htest objects named by topic.
pairwise_prop_tests <- function(counts_df, rows, group_n) {
  topics <- as.character(unique(counts_df$Substantive_Topic))
  fits <- lapply(topics, function(tp) {
    topic_counts <- counts_df$Number[which(counts_df$Substantive_Topic == tp)]
    # Number was filled from table(); hand prop.test() plain numbers.
    prop.test(x = as.numeric(topic_counts[rows]), n = group_n)
  })
  names(fits) <- topics
  fits
}

# Formatted table: both proportions, the CI of their difference, p-value.
prop_test_table <- function(fits, group_labels) {
  pick <- function(f) vapply(fits, f, character(1), USE.NAMES = FALSE)
  out <- data.frame(
    topic = names(fits),
    first = pick(function(r) fmt3(r$estimate[[1]])),
    second = pick(function(r) fmt3(r$estimate[[2]])),
    ci = pick(function(r) {
      paste("[", fmt3(r$conf.int[1]), ",", fmt3(r$conf.int[2]), "]")
    }),
    p = pick(function(r) fmt3(r$p.value)),
    stringsAsFactors = FALSE
  )
  colnames(out) <- c("Topic", group_labels, "CI of Diff.", "P-value")
  out
}

# Numeric table: first-minus-second proportion and its confidence bounds.
prop_diff_table <- function(fits) {
  pick <- function(f) vapply(fits, f, numeric(1), USE.NAMES = FALSE)
  data.frame(
    Topic = names(fits),
    Difference = pick(function(r) {
      round(r$estimate[[1]] - r$estimate[[2]], 10)
    }),
    Lower.CI = pick(function(r) round(r$conf.int[1], 10)),
    Upper.CI = pick(function(r) round(r$conf.int[2], 10)),
    stringsAsFactors = FALSE
  )
}

# Point-and-interval plot of the differences, with a dotted line at zero.
prop_diff_plot <- function(coef_df, x_label, x_limits) {
  ggplot(coef_df, aes(Difference, Topic, label = Topic)) +
    geom_point(size = 0.5) +
    geom_linerange(aes(xmin = Lower.CI, xmax = Upper.CI), size = 0.5) +
    theme_bw() +
    ylab("") +
    xlab(x_label) +
    scale_x_continuous(n.breaks = 10, limits = x_limits) +
    geom_vline(xintercept = 0, linetype = "dotted",
               color = "black", size = 0.7)
}

# Within each topic the rows of `table` are ordered Conservative, Moderate,
# Liberal (the order in which the group tables were row-bound), so `rows`
# selects groups by position. The x label names the second group on the
# left and the first group on the right, matching first-minus-second.
pairwise_comparisons <- list(
  # (1a) Conservative vs Moderate
  list(
    rows = 1:2,
    group_n = c(conservative_n, moderate_n),
    labels = c("Prop. Con.", "Prop. Mod."),
    x_label = "\n Moderate Tweets              Proportion Tweet Difference              Conservative Tweets",
    file = "Analysis/Oct_figures_3groups/coef_conVSmod.pdf"
  ),
  # (1b) Liberal vs Moderate
  list(
    rows = 2:3,
    group_n = c(moderate_n, liberal_n),
    labels = c("Prop. Mod.", "Prop. Lib."),
    x_label = "\n Liberal Tweets              Proportion Tweet Difference              Moderate Tweets",
    file = "Analysis/Oct_figures_3groups/coef_libVSmod.pdf"
  ),
  # (1c) Conservative vs Liberal
  list(
    rows = c(1, 3),
    group_n = c(conservative_n, liberal_n),
    labels = c("Prop. Con.", "Prop. Lib."),
    x_label = "\n Liberal Tweets              Proportion Tweet Difference              Conservative Tweets",
    file = "Analysis/Oct_figures_3groups/coef_conVSlib.pdf"
  )
)

for (cmp in pairwise_comparisons) {
  # Each test is run once and feeds both the table and the plot.
  fits <- pairwise_prop_tests(table, cmp$rows, cmp$group_n)

  test_results <- prop_test_table(fits, cmp$labels)
  print(xtable(test_results), include.rownames = FALSE)

  # coef plot with CI
  coef_results <- prop_diff_table(fits)
  coef_plot <- prop_diff_plot(coef_results, cmp$x_label, c(-0.2, 0.2))
  # ggsave() gets the plot explicitly: adding ggsave() to a plot with `+`
  # is rejected by recent ggplot2 releases.
  ggsave(filename = cmp$file, plot = coef_plot,
         width = 12, height = 6, dpi = 10000)
  print(coef_plot)
}




######################################################################
###############          Chisq Test                  #################
######################################################################

# (1) Chisq Test for substantive topics by ideology group

# Count rows per (topic, group) and reshape to one row per group with one
# column per topic.
table_agg_ideo <- as.data.frame(table(data$recode, data$ideo))
table_agg_ideo <- reshape(table_agg_ideo,
                          idvar = "Var2", timevar = "Var1",
                          direction = "wide")

# Take the row names from the group column itself. data$ideo was turned
# into a factor ordered Conservative, Moderate, Liberal in the mosaic
# section, so a hard-coded alphabetical vector would swap the Moderate and
# Liberal labels.
rownames(table_agg_ideo) <- as.character(table_agg_ideo$Var2)
table_agg_ideo <- table_agg_ideo[, -1]

chisq.test(table_agg_ideo)


# (2) Chisq Test for substantive topics by attentiveness
# NOTE(review): no `atten` column is created on `data` anywhere in the
# visible part of this script; presumably it comes from the input CSV or an
# earlier version of the pipeline -- confirm before running.
# The reshape gives one row per attentiveness level and one column per topic;
# the row names below assume the levels come out as High, Low -- verify.

table_agg_atten <- table(data$recode, data$atten)
table_agg_atten <- as.data.frame(table_agg_atten)

table_agg_atten <- reshape(table_agg_atten, idvar = "Var2", timevar = "Var1", direction = "wide")
rownames(table_agg_atten) <- c("High", "Low")
table_agg_atten<- table_agg_atten[, -1]

chisq.test(table_agg_atten)



# (3) Chisq Test for substantive topics by attentiveness and ideology
# NOTE(review): no `int` column is created on `data` in the visible part of
# this script either. The six hard-coded row names assume the levels come
# out in exactly this order -- verify against the actual levels.

table_agg_int <- table(data$recode, data$int)
table_agg_int <- as.data.frame(table_agg_int)

table_agg_int <- reshape(table_agg_int, idvar = "Var2", timevar = "Var1", direction = "wide")
rownames(table_agg_int) <- c("High Conservative", "High Liberal",      
                               "High Moderate",     "Low Conservative",
                               "Low Liberal",       "Low Moderate")
table_agg_int<- table_agg_int[, -1]

chisq.test(table_agg_int)




###########################################################################
###################       proportion test table          ##################
###########################################################################

# (1a) for six groups
# Six-sample test of equal proportions per substantive topic, across the
# attentiveness-by-ideology groups.
# NOTE(review): `table` as built earlier in this script only has the User
# values Conservative, Moderate and Liberal. With that table, each of the
# six totals below is a sum over zero rows (0), and each topic supplies
# three counts against six totals. This section looks like it expects
# `table` to be rebuilt with the six "High ..."/"Low ..." groups first --
# confirm.
test_results <- as.data.frame(matrix(nrow = 1, ncol = 8))
colnames(test_results) <- c("Topic", "Prop. High Con.", "Prop. Low Con.",
                            "Prop. High Mod.", "Prop. Low Mod.",
                            "Prop. High Lib.", "Prop. Low Lib.", 
                            "P-value")


# Total count per group, summed over that group's rows of `table`.
high_conservative_n <- sum(subset(table, User=="High Conservative")$Number)
low_conservative_n <- sum(subset(table, User=="Low Conservative")$Number)

high_moderate_n <- sum(subset(table, User=="High Moderate")$Number)
low_moderate_n <- sum(subset(table, User=="Low Moderate")$Number)

high_liberal_n <- sum(subset(table, User=="High Liberal")$Number)
low_liberal_n <- sum(subset(table, User=="Low Liberal")$Number)


# One prop.test per topic; the estimates and p-value are formatted to three
# decimals and appended as a row. The counts in this_topic$Number are matched
# to the totals by position.
for(i in unique(table$Substantive_Topic)){
  this_topic <- subset(table, Substantive_Topic==i)
  results <- prop.test(x = this_topic$Number, n = c(high_conservative_n, low_conservative_n,
                                                    high_moderate_n, low_moderate_n,
                                                    high_liberal_n, low_liberal_n))
  new.row <- c(i, format(round(results$estimate[1],3), nsmall=3), 
               format(round(results$estimate[2],3),nsmall=3), 
               format(round(results$estimate[3],3),nsmall=3), 
               format(round(results$estimate[4],3),nsmall=3), 
               format(round(results$estimate[5],3),nsmall=3), 
               format(round(results$estimate[6],3),nsmall=3), 
               format(round(results$p.value, 3),nsmall=3))
  test_results <- rbind(test_results,new.row)
}

# Drop the all-NA placeholder row and print the table as LaTeX.
library(xtable)
print(xtable(test_results[-1,]), include.rownames=FALSE)






# (2a) for two groups -- High Conservative vs Low Conservative
# NOTE(review): `table` as built earlier in this script only has the User
# values Conservative, Moderate and Liberal, so rows 1:2 of each topic are
# Conservative and Moderate, and the two totals used here are sums over
# zero rows. This section looks like it expects `table` to be rebuilt with
# the "High ..."/"Low ..." groups first -- confirm.
library(xtable)

# First rows are all-NA placeholders, dropped after the loop.
test_results <- as.data.frame(matrix(nrow = 1, ncol = 5))
colnames(test_results) <- c("Topic", "Prop. High Con.", "Prop. Low Con.",
                            "CI of Diff.", "P-value")

coef_results <- as.data.frame(matrix(nrow = 1, ncol = 4))
colnames(coef_results) <- c("Topic", "Difference", "Lower.CI", "Upper.CI")

# One two-sample prop.test per topic feeds both the formatted table and the
# numeric table used for the plot (the test was previously run twice).
for (i in unique(table$Substantive_Topic)) {
  this_topic <- subset(table, Substantive_Topic == i)
  results <- prop.test(x = this_topic$Number[1:2],
                       n = c(high_conservative_n, low_conservative_n))

  new.row <- c(i, format(round(results$estimate[1], 3), nsmall = 3),
               format(round(results$estimate[2], 3), nsmall = 3),
               paste("[", format(round(results$conf.int[1], 3), nsmall = 3),
                     ",", format(round(results$conf.int[2], 3), nsmall = 3),
                     "]"),
               format(round(results$p.value, 3), nsmall = 3))
  test_results <- rbind(test_results, new.row)

  coef.row <- c(i, round(results$estimate[1] - results$estimate[2], 10),
                round(results$conf.int[1], 10),
                round(results$conf.int[2], 10))
  coef_results <- rbind(coef_results, coef.row)
}

print(xtable(test_results[-1, ]), include.rownames = FALSE)


# coef plot with CI
# Rows were appended as character vectors; convert the numbers back.
coef_results <- coef_results[-1, ]
coef_results$Difference <- as.numeric(coef_results$Difference)
coef_results$Lower.CI <- as.numeric(coef_results$Lower.CI)
coef_results$Upper.CI <- as.numeric(coef_results$Upper.CI)

# ggplot2 plot with confidence intervals
coef_conser_plot <- ggplot(coef_results,
                           aes(Difference, Topic, label = Topic)) +
  geom_point(size = 0.5) +
  geom_linerange(aes(xmin = Lower.CI, xmax = Upper.CI), size = 0.5) +
  theme_bw() +
  ylab("") +
  xlab("\nLow Conservative Tweets              Proportion Tweet Difference              High Conservative Tweets") +
  scale_x_continuous(n.breaks = 10, limits = c(-0.04, 0.04)) +
  geom_vline(xintercept = 0, linetype = "dotted",
             color = "black", size = 0.7)

# ggsave() is called on its own with an explicit plot: adding it to a plot
# with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "Analysis/Oct_figures_remove/coef_conser.pdf",
       plot = coef_conser_plot,
       width = 12, height = 6, dpi = 10000)
coef_conser_plot


# (2b) Two-group comparison -- High Moderate vs Low Moderate.
# For each substantive topic, run a two-sample test of equal proportions and
# collect the estimates, the CI of the difference and the p-value as text.
test_results <- as.data.frame(matrix(nrow = 1, ncol = 5))  # single NA seed row
names(test_results) <- c("Topic", "Prop. High Mod.", "Prop. Low Mod.",
                         "CI of Diff.", "P-value")

# Round to three decimals and always show three decimals.
fmt3 <- function(value) format(round(value, 3), nsmall = 3)

for (topic in unique(table$Substantive_Topic)) {
  # Counts for this topic; which() discards NA comparisons.
  topic_counts <- table$Number[which(table$Substantive_Topic == topic)]
  # assumes entries 3-4 are the high- and low-moderate counts, in that
  # order -- TODO confirm against how `table` is built
  test_out <- prop.test(
    x = topic_counts[3:4],
    n = c(high_moderate_n, low_moderate_n)
  )
  ci_text <- paste(
    c("[", fmt3(test_out$conf.int[1]), ",", fmt3(test_out$conf.int[2]), "]"),
    collapse = " "
  )
  row_values <- c(
    topic,
    fmt3(test_out$estimate[1]),
    fmt3(test_out$estimate[2]),
    ci_text,
    fmt3(test_out$p.value)
  )
  test_results <- rbind(test_results, row_values)
}

# Print as LaTeX, leaving out the NA seed row.
library(xtable)
test_results[-1, ] %>%
  xtable() %>%
  print(include.rownames = FALSE)


# Data for the coefficient plot: per-topic difference in proportions together
# with the confidence interval reported by prop.test().
coef_results <- as.data.frame(matrix(nrow = 1, ncol = 4))  # single NA seed row
names(coef_results) <- c("Topic", "Difference", "Lower.CI", "Upper.CI")

for (topic in unique(table$Substantive_Topic)) {
  # Counts for this topic; which() discards NA comparisons.
  topic_counts <- table$Number[which(table$Substantive_Topic == topic)]
  test_out <- prop.test(
    x = topic_counts[3:4],
    n = c(high_moderate_n, low_moderate_n)
  )
  prop_gap <- test_out$estimate[1] - test_out$estimate[2]
  # c() mixes the topic label with numbers, so the whole row is stored as text.
  coef_row <- c(topic, round(c(prop_gap, test_out$conf.int), 10))
  coef_results <- rbind(coef_results, coef_row)
}

# Remove the seed row, then turn the text columns back into numbers.
coef_results <- coef_results[-1, ]
numeric_cols <- c("Difference", "Lower.CI", "Upper.CI")
coef_results[numeric_cols] <- lapply(coef_results[numeric_cols], as.numeric)

# Coefficient plot: one point per topic at the difference in proportions, a
# horizontal line spanning its confidence interval, and a dotted reference
# line at zero.
coef_plot_mod <- ggplot(coef_results, aes(Difference, Topic, label = Topic)) +
  geom_point(size = 0.5) +
  geom_linerange(aes(xmin = Lower.CI, xmax = Upper.CI), size = 0.5) +
  theme_bw() +
  ylab("") +
  xlab("\nLow Moderate Tweets              Proportion Tweet Difference              High Moderate Tweets") +
  # NOTE(review): points/intervals falling outside +/-0.04 are not drawn under
  # these limits -- confirm every topic fits.
  scale_x_continuous(n.breaks = 10, limits = c(-0.04, 0.04)) +
  geom_vline(xintercept = 0, linetype = "dotted", color = "black", size = 0.7)

# ggsave() is called on its own instead of being added to the plot with `+`:
# ggplot2 >= 3.3.4 raises an error for `plot + ggsave(...)`. Passing `plot`
# explicitly also avoids depending on last_plot(). `dpi` is omitted because it
# only applies to raster output, not PDF.
ggsave(
  filename = "Analysis/Oct_figures_remove/coef_mod.pdf",
  plot = coef_plot_mod,
  width = 12, height = 6
)

# Display the figure as well when run at top level.
coef_plot_mod


# (2c) Two-group comparison -- High Liberal vs Low Liberal.
# For each substantive topic, run a two-sample test of equal proportions and
# collect the estimates, the CI of the difference and the p-value as text.
test_results <- as.data.frame(matrix(nrow = 1, ncol = 5))  # single NA seed row
names(test_results) <- c("Topic", "Prop. High Lib.", "Prop. Low Lib.",
                         "CI of Diff.", "P-value")

# Round to three decimals and always show three decimals.
fmt3 <- function(value) format(round(value, 3), nsmall = 3)

for (topic in unique(table$Substantive_Topic)) {
  # Counts for this topic; which() discards NA comparisons.
  topic_counts <- table$Number[which(table$Substantive_Topic == topic)]
  # assumes entries 5-6 are the high- and low-liberal counts, in that
  # order -- TODO confirm against how `table` is built
  test_out <- prop.test(
    x = topic_counts[5:6],
    n = c(high_liberal_n, low_liberal_n)
  )
  ci_text <- paste(
    c("[", fmt3(test_out$conf.int[1]), ",", fmt3(test_out$conf.int[2]), "]"),
    collapse = " "
  )
  row_values <- c(
    topic,
    fmt3(test_out$estimate[1]),
    fmt3(test_out$estimate[2]),
    ci_text,
    fmt3(test_out$p.value)
  )
  test_results <- rbind(test_results, row_values)
}

# Print as LaTeX, leaving out the NA seed row.
library(xtable)
test_results[-1, ] %>%
  xtable() %>%
  print(include.rownames = FALSE)


# Data for the coefficient plot: per-topic difference in proportions together
# with the confidence interval reported by prop.test().
coef_results <- as.data.frame(matrix(nrow = 1, ncol = 4))  # single NA seed row
names(coef_results) <- c("Topic", "Difference", "Lower.CI", "Upper.CI")

for (topic in unique(table$Substantive_Topic)) {
  # Counts for this topic; which() discards NA comparisons.
  topic_counts <- table$Number[which(table$Substantive_Topic == topic)]
  test_out <- prop.test(
    x = topic_counts[5:6],
    n = c(high_liberal_n, low_liberal_n)
  )
  prop_gap <- test_out$estimate[1] - test_out$estimate[2]
  # c() mixes the topic label with numbers, so the whole row is stored as text.
  coef_row <- c(topic, round(c(prop_gap, test_out$conf.int), 10))
  coef_results <- rbind(coef_results, coef_row)
}

# Remove the seed row, then turn the text columns back into numbers.
coef_results <- coef_results[-1, ]
numeric_cols <- c("Difference", "Lower.CI", "Upper.CI")
coef_results[numeric_cols] <- lapply(coef_results[numeric_cols], as.numeric)

# Coefficient plot: one point per topic at the difference in proportions, a
# horizontal line spanning its confidence interval, and a dotted reference
# line at zero.
coef_plot_lib <- ggplot(coef_results, aes(Difference, Topic, label = Topic)) +
  geom_point(size = 0.5) +
  geom_linerange(aes(xmin = Lower.CI, xmax = Upper.CI), size = 0.5) +
  theme_bw() +
  ylab("") +
  xlab("\nLow Liberal Tweets              Proportion Tweet Difference              High Liberal Tweets") +
  # NOTE(review): points/intervals falling outside +/-0.04 are not drawn under
  # these limits -- confirm every topic fits.
  scale_x_continuous(n.breaks = 10, limits = c(-0.04, 0.04)) +
  geom_vline(xintercept = 0, linetype = "dotted", color = "black", size = 0.7)

# ggsave() is called on its own instead of being added to the plot with `+`:
# ggplot2 >= 3.3.4 raises an error for `plot + ggsave(...)`. Passing `plot`
# explicitly also avoids depending on last_plot(). `dpi` is omitted because it
# only applies to raster output, not PDF.
ggsave(
  filename = "Analysis/Oct_figures_remove/coef_lib.pdf",
  plot = coef_plot_lib,
  width = 12, height = 6
)

# Display the figure as well when run at top level.
coef_plot_lib


